I. Library Import
import pandas as pd
import numpy as np
import plotly.offline as py
import plotly
plotly.offline.init_notebook_mode()
plotly.tools.set_credentials_file(username='stfox13', api_key='Y4QUZyKzXSz7t8P209HF')
from plotly import *
from plotly.graph_objs import *
pd.options.display.float_format = '{0:.2f}'.format
import plotly.graph_objs as go
import cufflinks as cf
%matplotlib inline
mapbox_access_token='pk.eyJ1Ijoic3Rmb3gxMyIsImEiOiJjajJnejd6M2cwMDVmMnlsZ3I4aGI5bmwzIn0.03n_RH-9d3mCBRx9TCNaLw'
II. Data Dictionary
| Column Name | Definition | Data Type |
|---|---|---|
| id | record identifier | string |
| date | Date house was sold | datetime |
| price | Price is prediction target | numeric |
| bedrooms | Number of Bedrooms/House | numeric |
| bathrooms | Number of bathrooms/House | numeric |
| sqft_living | square footage of the home | numeric |
| sqft_lot | square footage of the lot | numeric |
| floors | Total floors (levels) in house | numeric |
| waterfront | House which has a view to a waterfront | numeric |
| view | Has been viewed | numeric |
| condition | How good the condition is (Overall) | numeric |
| grade | The perceived state of the home | numeric |
| sqft_above | square footage of house apart from basement | numeric |
| sqft_basement | square footage of the basement | numeric |
| yr_built | Built Year | numeric |
| yr_renovated | Year when house was renovated | numeric |
| zipcode | Zipcode of the home in question | numeric |
| lat | Latitude coordinate | numeric |
| long | Longitude coordinate | numeric |
| sqft_living15 | Living room area in 2015(implies-- some renovations) This might or might not have affected the lotsize area | numeric |
| sqft_lot15 | lotSize area in 2015(implies-- some renovations) | numeric |
Note: all of the other columns are dummy variables / calculated columns / classifications created for deeper analysis.
III. Dataset Import
df = pd.read_csv("/Users/sfox/Documents/GADataScience/Homework/FinalProject/Dataset2/kc_house_data.csv",
skipinitialspace=True,
parse_dates=[1],
dtype={
"id":np.str,
"date":np.str,
"price":np.int_,
"bedrooms":np.int_,
"bathrooms":np.float_,
"sqft_living":np.int_,
"sqft_lot":np.int_,
"floors":np.float_,
"waterfront":np.int_,
"view":np.int_,
"condition":np.int_,
"grade":np.int_,
"sqft_above":np.int_,
"sqft_basement":np.int_,
"yr_built":np.int_,
"yr_renovated":np.int_,
"zipcode":np.str,
"lat":np.float_,
"long":np.float_,
"sqft_living15":np.float_,
"sqft_lot15":np.float_
})
df['decade_built']=df['yr_built'].astype('str').str.slice(start=0,stop=3)+"0"
df['renovated']=np.where(df['yr_renovated']==0,0,1)
df['year']=df['date'].dt.year
df['price_sqft_living'] = df['price'].astype('float')/df['sqft_living'].astype('float')
df['price_sqft_lot'] = df['price'].astype('float')/df['sqft_lot'].astype('float')
IV. Initial Dataset Review
df.head(10)
df.corr()
df.describe()
V. Dataset Creation for Visualizations
priceStats = df.groupby('date').agg({'price':['mean','std']}).reset_index()
upper_bound = go.Scatter(
name='Upper Bound',
x=priceStats['date'],
y=priceStats.price['mean']+priceStats.price['std'],
mode='lines',
marker=dict(color="444"),
line=dict(width=0),
fillcolor='rgba(68, 68, 68, 0.3)',
fill='tonexty' )
trace = go.Scatter(
name='Mean',
x=priceStats['date'],
y=priceStats.price['mean'],
mode='lines',
line=dict(color='rgb(31, 119, 180)'),
fillcolor='rgba(68, 68, 68, 0.3)',
fill='tonexty' )
lower_bound = go.Scatter(
name='Lower Bound',
x=priceStats['date'],
y=priceStats.price['mean']-priceStats.price['std'],
marker=dict(color="444"),
line=dict(width=0),
mode='lines')
data = [lower_bound, trace, upper_bound]
layout = go.Layout(
yaxis=dict(autotick=False,dtick=250000,title='Home Sale Price'),
xaxis=dict(title='Transaction Date'),
title='Average Home Sale Price, King County (WA) <br> Duration: 5/2014 - 5/2015',
showlegend = False)
fig0 = go.Figure(data=data, layout=layout)
priceStatsByDecade = df[['decade_built','zipcode','price','sqft_lot','sqft_living']]
priceStatsByDecade_1900 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '1900']
priceStatsByDecade_1910 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '1910']
priceStatsByDecade_1920 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '1920']
priceStatsByDecade_1930 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '1930']
priceStatsByDecade_1940 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '1940']
priceStatsByDecade_1950 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '1950']
priceStatsByDecade_1960 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '1960']
priceStatsByDecade_1970 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '1970']
priceStatsByDecade_1980 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '1980']
priceStatsByDecade_1990 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '1990']
priceStatsByDecade_2000 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '2000']
priceStatsByDecade_2010 = priceStatsByDecade[priceStatsByDecade['decade_built'] == '2010']
trace0 = go.Scatter(
x = priceStatsByDecade_1900['sqft_living'],
y = priceStatsByDecade_1900['price'],
name = '1900',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(171, 171, 171)',
line = dict(
width = 1
)
)
)
trace1 = go.Scatter(
x = priceStatsByDecade_1910['sqft_living'],
y = priceStatsByDecade_1910['price'],
name = '1910',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(174, 117, 117)',
line = dict(
width = 1
)
)
)
trace2 = go.Scatter(
x = priceStatsByDecade_1920['sqft_living'],
y = priceStatsByDecade_1920['price'],
name = '1920',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(179, 0, 0)',
line = dict(
width = 1
)
)
)
trace3 = go.Scatter(
x = priceStatsByDecade_1930['sqft_living'],
y = priceStatsByDecade_1930['price'],
name = '1930',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(56, 0, 250)',
line = dict(
width = 1
)
)
)
trace4 = go.Scatter(
x = priceStatsByDecade_1940['sqft_living'],
y = priceStatsByDecade_1940['price'],
name = '1940',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(52, 148, 52)',
line = dict(
width = 1
)
)
)
trace5 = go.Scatter(
x = priceStatsByDecade_1950['sqft_living'],
y = priceStatsByDecade_1950['price'],
name = '1950',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(39, 111, 111)',
line = dict(
width = 1
)
)
)
trace6 = go.Scatter(
x = priceStatsByDecade_1960['sqft_living'],
y = priceStatsByDecade_1960['price'],
name = '1960',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(220, 219, 66)',
line = dict(
width = 1
)
)
)
trace7 = go.Scatter(
x = priceStatsByDecade_1970['sqft_living'],
y = priceStatsByDecade_1970['price'],
name = '1970',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(229, 173, 0)',
line = dict(
width = 1
)
)
)
trace8 = go.Scatter(
x = priceStatsByDecade_1980['sqft_living'],
y = priceStatsByDecade_1980['price'],
name = '1980',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(229, 88, 0)',
line = dict(
width = 1
)
)
)
trace9 = go.Scatter(
x = priceStatsByDecade_1990['sqft_living'],
y = priceStatsByDecade_1990['price'],
name = '1990',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(189, 171, 171)',
line = dict(
width = 1
)
)
)
trace10 = go.Scatter(
x = priceStatsByDecade_2000['sqft_living'],
y = priceStatsByDecade_2000['price'],
name = '2000',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(81, 0, 74)',
line = dict(
width = 1
)
)
)
trace11 = go.Scatter(
x = priceStatsByDecade_2010['sqft_living'],
y = priceStatsByDecade_2010['price'],
name = '2010',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(0, 111, 111)',
line = dict(
width = 1
)
)
)
data = [trace0, trace1,trace2,trace3,trace4,trace5,trace6,trace7,trace8,trace9,trace10,trace11]
layout = dict(
yaxis = dict(autotick=False,dtick=250000,title='Home Sale Price'),
xaxis = dict(autotick=False,dtick=1000,title='Living Space (Sqft)'),
title = 'Correlation of Living Space (Sqft) to Home Sale Price, King County (WA) <br> Color-Coded By Decade',
)
fig1 = dict(data=data, layout=layout)
trace0 = go.Scatter(
x = priceStatsByDecade_1900['sqft_lot'],
y = priceStatsByDecade_1900['price'],
name = '1900',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(171, 171, 171)',
line = dict(
width = 1
)
)
)
trace1 = go.Scatter(
x = priceStatsByDecade_1910['sqft_lot'],
y = priceStatsByDecade_1910['price'],
name = '1910',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(174, 117, 117)',
line = dict(
width = 1
)
)
)
trace2 = go.Scatter(
x = priceStatsByDecade_1920['sqft_lot'],
y = priceStatsByDecade_1920['price'],
name = '1920',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(179, 0, 0)',
line = dict(
width = 1
)
)
)
trace3 = go.Scatter(
x = priceStatsByDecade_1930['sqft_lot'],
y = priceStatsByDecade_1930['price'],
name = '1930',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(56, 0, 250)',
line = dict(
width = 1
)
)
)
trace4 = go.Scatter(
x = priceStatsByDecade_1940['sqft_lot'],
y = priceStatsByDecade_1940['price'],
name = '1940',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(52, 148, 52)',
line = dict(
width = 1
)
)
)
trace5 = go.Scatter(
x = priceStatsByDecade_1950['sqft_lot'],
y = priceStatsByDecade_1950['price'],
name = '1950',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(39, 111, 111)',
line = dict(
width = 1
)
)
)
trace6 = go.Scatter(
x = priceStatsByDecade_1960['sqft_lot'],
y = priceStatsByDecade_1960['price'],
name = '1960',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(220, 219, 66)',
line = dict(
width = 1
)
)
)
trace7 = go.Scatter(
x = priceStatsByDecade_1970['sqft_lot'],
y = priceStatsByDecade_1970['price'],
name = '1970',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(229, 173, 0)',
line = dict(
width = 1
)
)
)
trace8 = go.Scatter(
x = priceStatsByDecade_1980['sqft_lot'],
y = priceStatsByDecade_1980['price'],
name = '1980',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(229, 88, 0)',
line = dict(
width = 1
)
)
)
trace9 = go.Scatter(
x = priceStatsByDecade_1990['sqft_lot'],
y = priceStatsByDecade_1990['price'],
name = '1990',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(189, 171, 171)',
line = dict(
width = 1
)
)
)
trace10 = go.Scatter(
x = priceStatsByDecade_2000['sqft_lot'],
y = priceStatsByDecade_2000['price'],
name = '2000',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(81, 0, 74)',
line = dict(
width = 1
)
)
)
trace11 = go.Scatter(
x = priceStatsByDecade_2010['sqft_lot'],
y = priceStatsByDecade_2010['price'],
name = '2010',
mode = 'markers',
marker = dict(
size = 5,
color = 'rgb(0, 111, 111)',
line = dict(
width = 1
)
)
)
data = [trace0, trace1,trace2,trace3,trace4,trace5,trace6,trace7,trace8,trace9,trace10,trace11]
layout = dict(
yaxis = dict(range=[0,8000000],autotick=False,dtick=250000,title='Home Sale Price'),
xaxis = dict(title='Lot Space (Sqft)'),
title = 'Correlation of Lot Space (Sqft) to Home Sale Price, King County (WA) <br> Color-Coded By Decade',
)
fig2 = dict(data=data, layout=layout)
sqftPriceByDecade = df.groupby('decade_built').agg({'price_sqft_living':'mean','price_sqft_lot':'mean'}).reset_index()
sqftPriceByDecade
trace0 = go.Scatter(
x=sqftPriceByDecade['price_sqft_living'],
y=sqftPriceByDecade['decade_built'],
mode='markers',
name='Mean Price per Sqft (living)',
marker=dict(
color='rgb(81, 0, 74)',
line=dict(
color='rgba(156, 165, 196, 1.0)',
width=1,
),
symbol='circle',
size=16,
)
)
trace1 = go.Scatter(
x=sqftPriceByDecade['price_sqft_lot'],
y=sqftPriceByDecade['decade_built'],
mode='markers',
name='Mean Price per Sqft (lot)',
marker=dict(
color='rgb(0, 111, 111)',
line=dict(
color='rgba(217, 217, 217, 1.0)',
width=1,
),
symbol='circle',
size=16,
)
)
data = [trace0, trace1]
layout = dict(
yaxis = dict(range=[1890,2020],autotick=False,dtick=10,ticks='outside',zeroline = False,title='Decade Built'),
xaxis = dict(autotick=False,dtick=50,ticks='outside',zeroline = False,title='Mean Price per Sqft'),
title = 'Comparison of Mean Price per Sqft (lot) Versus Mean Price per Sqft (living), King County (WA) <br> Color-Coded By Sqft Classification',
)
fig3 = go.Figure(data=data, layout=layout)
homeImprovementSqft = df[['waterfront','renovated','view','price_sqft_lot','price_sqft_living']]
trace0 = go.Box(
y=homeImprovementSqft['price_sqft_living'],
x=homeImprovementSqft['waterfront'],
name='Price Per Sqft (living)',
marker=dict(
color='rgb(81, 0, 74)'
)
)
trace1 = go.Box(
y=homeImprovementSqft['price_sqft_lot'],
x=homeImprovementSqft['waterfront'],
name='Price Per Sqft (lot)',
marker=dict(
color='rgb(0, 111, 111)'
)
)
data = [trace0,trace1]
layout = go.Layout(
xaxis=dict(
title='Is Waterfront Property<br>(0 = False, 1 = True)'
),
yaxis=dict(
range=[0,2100],
autotick=False,dtick=100,ticks='outside',
title='Price Per Sqft'
),
boxmode='group',
title = 'Box Plot Comparison of Waterfront versus Non-Waterfront Properties, King County (WA) <br> Color-Coded By Sqft Classification',
)
fig4 = go.Figure(data=data, layout=layout)
trace0 = go.Box(
y=homeImprovementSqft['price_sqft_living'],
x=homeImprovementSqft['renovated'],
name='Price Per Sqft (living)',
marker=dict(
color='rgb(81, 0, 74)'
)
)
trace1 = go.Box(
y=homeImprovementSqft['price_sqft_lot'],
x=homeImprovementSqft['renovated'],
name='Price Per Sqft (lot)',
marker=dict(
color='rgb(0, 111, 111)'
)
)
data = [trace0,trace1]
layout = go.Layout(
xaxis=dict(
title='Is Renovated Property<br>(0 = False, 1 = True)'
),
yaxis=dict(
range=[0,2200],
autotick=False,dtick=100,ticks='outside',
title='Price Per Sqft'
),
boxmode='group',
title = 'Box Plot Comparison of Renovated Properties versus Non-Renovated Properties, King County (WA) <br> Color-Coded By Sqft Classification',
)
fig5 = go.Figure(data=data, layout=layout)
trace0 = go.Histogram(
x=df['bathrooms'],
marker=dict(
color='rgb(0, 111, 111)'
)
)
data=[trace0]
layout = go.Layout(
xaxis=dict(
title='Number of Bathrooms'
),
title='Distribution of Bathrooms, King County (WA)'
)
fig6 = go.Figure(data=data, layout=layout)
trace0 = go.Histogram(
x=df['bathrooms'],histnorm='probability',
marker=dict(
color='rgb(0, 111, 111)'
)
)
data=[trace0]
layout = go.Layout(
xaxis=dict(
title='Number of Bathrooms'
),
title='Normalized Distribution of Bathrooms, King County (WA)'
)
fig7 = go.Figure(data=data, layout=layout)
trace0 = go.Histogram(
x=df['bedrooms'],
marker=dict(
color='rgb(81, 0, 74)'
)
)
data=[trace0]
layout = go.Layout(
xaxis=dict(
title='Number of Bedrooms'
),
title='Distribution of Bedrooms, King County (WA)'
)
fig8 = go.Figure(data=data, layout=layout)
trace0 = go.Histogram(
x=df['bedrooms'],
histnorm='probability',
marker=dict(
color='rgb(81, 0, 74)'
)
)
data=[trace0]
layout = go.Layout(
xaxis=dict(
title='Number of Bedrooms'
),
title='Normalized Distribution of Bedrooms, King County (WA)'
)
fig9 = go.Figure(data=data, layout=layout)
lats = list(df.lat)
lons = list(df.long)
prices = list(df.price)
data = Data([
Scattermapbox(
lat=lats,
lon=lons,
mode='markers',
marker=Marker(
size=9
),
text=prices,
)
])
layout = Layout(
autosize=True,
hovermode='closest',
mapbox=dict(
accesstoken=mapbox_access_token,
bearing=0,
center=dict(
lat=47.608013,
lon=-122.335167
),
pitch=0,
zoom=7.9
),
title='Home Sales by Location, King County (WA)<br>(Will be expanded for final presentation)'
)
fig10 = dict(data=data, layout=layout)
VI. Visualization Generation
py.iplot(fig0)
py.iplot(fig1)
py.iplot(fig2)
py.iplot(fig3)
py.iplot(fig4)
py.iplot(fig5)
py.iplot(fig6)
py.iplot(fig7)
py.iplot(fig8)
py.iplot(fig9)
py.iplot(fig10)
VII. Next Steps
I requested and received GreatSchools API Access. GreatSchools tracks standardized test scores and parent ratings for public and private schools across the United States. I plan to join information extracted from the GreatSchools site via the API to join the housing information and derive proximity to good, average, and bad schools - classifications derived from the standardized test scores found within the data.
At this time, I am having a bit of trouble joining the two datasets - this will be the focus of my efforts moving forward.